# -*-Python-*-
# Billion word language modeling benchmark from tfds
# tokenized on-the-fly using t2t vocabulary (requires custom ops)

import mesh_tensorflow.transformer.t2t_vocabulary

# Dataset
# `dataset_name` is a gin macro; it is referenced below as %dataset_name.
dataset_name = "lm1b"
# `@name` (no parentheses) binds the configurable itself as the training
# dataset function; the two bindings that follow set its parameters.
utils.run.train_dataset_fn = @mesh_tensorflow.transformer.dataset.untokenized_tfds_dataset
dataset.untokenized_tfds_dataset.dataset_name = %dataset_name
# NOTE(review): text2self presumably selects language-model-style examples
# (targets only, no separate inputs) -- confirm against
# untokenized_tfds_dataset.
dataset.untokenized_tfds_dataset.text2self = True

# Vocabulary
# The trailing "()" makes this an evaluated reference: gin calls
# get_t2t_vocabulary and binds its return value, not the function itself.
utils.run.vocabulary = @mesh_tensorflow.transformer.t2t_vocabulary.get_t2t_vocabulary()
# NOTE(review): the vocabulary file is named for translate_ende_wmt32k while
# the dataset configured in this file is lm1b -- confirm this is intentional.
t2t_vocabulary.get_t2t_vocabulary.vocabulary_filename = "vocab.translate_ende_wmt32k.32768.subwords"


# Model
# NOTE(review): this binding is scoped as `run.` whereas the dataset and
# vocabulary bindings in this file use `utils.run.`; presumably both resolve
# to the same configurable -- confirm.
run.model_type = "lm"
